library(xml2)
library(httr) 
library(stringr)
library(dplyr)
library(readr)

# Load the coded PLOS article table; later steps read its `url` and
# `data_in_paper` columns.
plos_articles <- read_csv("data/raw/plos_fully_coded.csv")

# Function to check if an XML has a supplementary materials section
#' @title Check for supplementary materials in XML
#' @description Fetches an XML document from a URL and checks whether it
#'   contains a `sec` element with `sec-type="supplementary-material"`.
#'   Every failure (unusable URL, non-200 response, timeout, unparsable XML)
#'   is reported with `message()` and yields `NA` instead of an error, so a
#'   batch run is not aborted by a single bad article.
#' @param url A single URL (character string) to fetch the XML document from.
#' @param timeout_sec Maximum number of seconds to wait for the request
#'   before giving up. Defaults to 30.
#' @return `1` if a supplementary materials section exists; `NA_real_` if
#'   none was found or the document could not be retrieved or parsed.
#' @examples
#' check_supplement("https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0253127&type=manuscript")
check_supplement <- function(url, timeout_sec = 30) {
  # Reject unusable input up front so no request is attempted for it
  if (!is.character(url) || length(url) != 1L || is.na(url) || !nzchar(url)) {
    message("Skipping missing or invalid URL")
    return(NA_real_)
  }

  tryCatch({
    # Bound the request time: without a timeout a stalled connection would
    # block the caller indefinitely. A timeout raises an error that is
    # handled below like any other failure.
    response <- httr::GET(url, httr::timeout(timeout_sec))

    # Anything other than HTTP 200 is treated as "could not retrieve"
    if (httr::status_code(response) != 200) {
      message("Failed to retrieve content from URL: ", url)
      return(NA_real_)
    }

    # Parse the response body as XML
    xml_content <- httr::content(response, as = "text", encoding = "UTF-8")
    xml_doc <- xml2::read_xml(xml_content)

    # Look for a section with sec-type="supplementary-material"
    supp_nodes <- xml2::xml_find_all(
      xml_doc,
      "//sec[@sec-type='supplementary-material']"
    )

    # NA_real_ (not logical NA) keeps the return type numeric in both cases
    if (length(supp_nodes) > 0) 1 else NA_real_
  }, error = function(e) {
    message("Error processing URL: ", url, "\nError: ", conditionMessage(e))
    NA_real_
  })
}

# Process the dataframe
#' @title Flag articles that have a supplementary materials section
#' @description Adds a `has_supplement` column to `df`. For every row with
#'   `data_in_paper == 1` whose `url` looks like an XML document, the URL is
#'   passed to `check_supplement()` and the result is stored in that row.
#'   Rows with a missing or non-XML URL are skipped and keep `NA`.
#' @param df A data frame with columns `url` and `data_in_paper`.
#' @return `df` with an added numeric column `has_supplement` holding the
#'   value returned by `check_supplement()` for processed rows and `NA` for
#'   all other rows.
process_dataframe <- function(df) {
  # Typed, correctly sized NA column: this also works for a zero-row data
  # frame and keeps the column numeric even when no supplement is found.
  df$has_supplement <- rep(NA_real_, nrow(df))

  # Get rows where data_in_paper = 1 (which() drops NA comparisons)
  rows_to_process <- which(df$data_in_paper == 1)
  n_rows <- length(rows_to_process)

  if (n_rows == 0) {
    message("No rows with data_in_paper = 1 found.")
    return(df)
  }

  message(paste("Processing", n_rows, "URLs..."))

  # Counts real HTTP requests so that skipped rows do not trigger pauses
  requests_made <- 0L

  for (i in seq_len(n_rows)) {
    row_idx <- rows_to_process[i]
    current_url <- df$url[row_idx]

    message(paste("Processing", i, "of", n_rows, ":", current_url))

    # A missing URL makes str_detect() return NA, which `if` cannot branch
    # on; test for it first so one bad row cannot abort the whole run.
    is_xml_url <- length(current_url) == 1L && !is.na(current_url) &&
      (stringr::str_detect(current_url, "\\.xml$") ||
         stringr::str_detect(
           current_url,
           "article/file\\?id=.*&type=manuscript"
         ))

    if (!is_xml_url) {
      message("Skipping non-XML URL: ", current_url)
      next
    }

    df$has_supplement[row_idx] <- check_supplement(current_url)
    requests_made <- requests_made + 1L

    # Take a break every 5 requests, but not after the final row
    if (requests_made %% 5L == 0L && i < n_rows) {
      message("Taking a 3-second break...")
      Sys.sleep(3)
    }
  }

  df
}

# Run the supplement check on the loaded articles. process_dataframe() fetches
# article XML over HTTP for rows with data_in_paper == 1, so this step needs
# network access; the returned data frame (with the added has_supplement
# column) replaces plos_articles.
plos_articles <- process_dataframe(plos_articles)

# To test with a small subset:
# test_df <- head(plos_articles, 10)
# test_df <- process_dataframe(test_df)


# Calculate percentage of papers with supplements among those with data_in_paper = 1
#' @title Share of data-in-paper articles that have a supplement
#' @description Restricts `df` to rows with `data_in_paper == 1` and reports
#'   how many of them have `has_supplement == 1`, as counts and as a
#'   percentage of that subset.
#' @param df A data frame with columns `data_in_paper` and `has_supplement`.
#' @return A named list with `percentage` (`NA_real_` when no row has
#'   `data_in_paper == 1`), `papers_with_supplement` and
#'   `total_papers_with_data`.
calculate_supplement_percentage <- function(df) {
  # Denominator: every article flagged as having its data in the paper
  with_data <- dplyr::filter(df, data_in_paper == 1)
  n_with_data <- nrow(with_data)

  # Numerator: the subset positively flagged as having a supplement
  flagged <- dplyr::filter(
    with_data,
    !is.na(has_supplement) & has_supplement == 1
  )
  n_flagged <- nrow(flagged)

  # Without any qualifying article the percentage is undefined
  if (n_with_data > 0) {
    pct <- (n_flagged / n_with_data) * 100
  } else {
    pct <- NA_real_
  }

  list(
    percentage = pct,
    papers_with_supplement = n_flagged,
    total_papers_with_data = n_with_data
  )
}

# Summarise supplement coverage for the processed articles
result <- calculate_supplement_percentage(plos_articles)

# Print a one-line report: the percentage, then the two underlying counts
cat(
  sprintf(
    "%.2f%% of papers with data_in_paper have supplements (%d out of %d)\n",
    result[["percentage"]],
    result[["papers_with_supplement"]],
    result[["total_papers_with_data"]]
  )
)

# Persist the data frame, including the has_supplement column, as RDS
saveRDS(plos_articles, file = "data/raw/added_supplements.rds")
